# Loading packages
library(readtext)
library(readxl)
library(dplyr)
library(tidyverse)
library(tidytext)
library(tm)
library(textstem)
library(wordcloud)
library(slam)
library(topicmodels)
library(SentimentAnalysis)

Introduction

The main objective of this paper is to analyse the Federal Open Market Commitee statements using text mining methods and tools provided by R. We start with basic analysis of (….)

What is Federal Open Market Committee?

Federal Open Market Committee (FOMC) is the body of the central bank of United States (the Federal Reserve System). Its main duties is setting the national monetary policy. The FOMC holds eight regularly scheduled meetings per year. At these meetings, the Committee reviews economic and financial conditions, determines the appropriate stance of monetary policy, and assesses the risks to its long-run goals of price stability and sustainable economic growth. The FOMC consists of 12 voting members: seven members of the Board of Governors, the president of the Federal Reserve Bank of New York and 4 of the remaining 11 Reserve Bank presidents, who serve one-year terms on a rotating basis. All 12 of the Reserve Bank presidents attend FOMC meetings and participate in FOMC discussions, but only the presidents who are Committee members at the time may vote on policy decisions. FOMC meetings typically are held eight times each year in Washington, D.C., and at other times as needed.

How are statements organized?

The Committee releases a public statement immediately after each FOMC meeting. Each statement follows very similar structure. Firstly, the general background of the economic situation is presented. Then the Commitee introduces the value of the established federal funds rate and also share predictions. At the end, there are listed names of people which voted for the FOMC monetary policy action.

Data description

We sourced the data by scraping the statements from the Federal Reserve official website 1 using Python. In the scraping algorithm we limited the content only to FOMC announcment, omitting the names of voters listed in the last paragraph. The analysed period includes years from 2006 to 2018 which resulted in obtaining 107 documents.

# Loading scrapped statements
# DATA_DIR <- "C:/Users/KAndr/OneDrive/Studia/II rok I semestr/Text mining/Text mining project/Statements/"
# DATA_DIR <- "C:/Users/KAndr/OneDrive/Studia/II rok I semestr/Text mining/Text mining project/Statements/"
DATA_DIR <-  "C:/Users/KAndr/OneDrive/Studia/II rok I semestr/Text mining/Text mining project/Statements/"
  # "~/Desktop/FOMC-text-mining/Statements"

fomc_2006 <- readtext(paste0(DATA_DIR, "/2006/*"))
fomc_2007 <- readtext(paste0(DATA_DIR, "/2007/*"))
fomc_2008 <- readtext(paste0(DATA_DIR, "/2008/*"))
fomc_2009 <- readtext(paste0(DATA_DIR, "/2009/*"))
fomc_2010 <- readtext(paste0(DATA_DIR, "/2010/*"))
fomc_2011 <- readtext(paste0(DATA_DIR, "/2011/*"))
fomc_2012 <- readtext(paste0(DATA_DIR, "/2012/*"))
fomc_2013 <- readtext(paste0(DATA_DIR, "/2013/*"))
fomc_2014 <- readtext(paste0(DATA_DIR, "/2014/*"))
fomc_2015 <- readtext(paste0(DATA_DIR, "/2015/*"))
fomc_2016 <- readtext(paste0(DATA_DIR, "/2016/*"))
fomc_2017 <- readtext(paste0(DATA_DIR, "/2017/*"))
fomc_2018 <- readtext(paste0(DATA_DIR, "/2018/*"))
# Binding data
statements <- rbind(fomc_2006, fomc_2007, fomc_2008, fomc_2009, fomc_2010, fomc_2011,
                    fomc_2012, fomc_2013, fomc_2014, fomc_2015, fomc_2016, fomc_2017, fomc_2018)
# Removing files from memory
remove(fomc_2006, fomc_2007, fomc_2008, fomc_2009, fomc_2010, fomc_2011,
       fomc_2012, fomc_2013, fomc_2014, fomc_2015, fomc_2016, fomc_2017, fomc_2018)

Text preparation

We start our work on statments with the initial preprocessing of the dataset. It consists of two columns: doc_id and text. Doc_id is sourced from each statement’s website link. Text is just a content of the statement.

head(statements, 1)
## readtext object consisting of 1 document and 0 docvars.
## # Description: df[,2] [1 x 2]
##   doc_id       text               
## * <chr>        <chr>              
## 1 20060131.txt "\"The Federa\"..."
# adding an unique ID
statements <- statements %>% mutate(ID = 1:n())
# setting column names 
colnames(statements) <- c("Date", "Text", "ID")
# modification of doc_id column - changing it to date column
statements$Date <- gsub(".txt", "", statements$Date)
statements$Date <- as.Date(statements$Date, "%Y%m%d ")
statements_all <- as.vector(statements$Text)
length(statements_all) 
## [1] 107

The next step was concerting the dataset into volatile corpora which is a handful form in the following operations. Below can be seen an example statement before any text preprocessing operations applied.

(corpus_all <- VCorpus(VectorSource(statements_all)))
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 107
inspect(corpus_all[[1]])
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 778
## 
## The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4-1/2 percent. Although recent economic data have been uneven, the expansion in economic activity appears solid. Core inflation has stayed relatively low in recent months and longer-term inflation expectations remain contained. Nevertheless, possible increases in resource utilization as well as elevated energy prices have the potential to add to inflation pressures. The Committee judges that some further policy firming may be needed to keep the risks to the attainment of both sustainable economic growth and price stability roughly in balance. In any event, the Committee will respond to changes in economic prospects as needed to foster these objectives.

Preprocessing

We start preprocessing with text cleaning using tm_map() function. We lower each case, remove words from the built-in stopwords list, we remove punctuation, unnecessary whitespaces and numbers. At the end we apply PlainTextDocument() function.

corpus_clean <- corpus_all %>% 
    tm_map(tolower) %>%
    tm_map(removeWords, stopwords("en")) %>% 
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>% 
    tm_map(removeNumbers) %>% 
    tm_map(PlainTextDocument)

Below can be seen examples of the statements after above cleaning steps.

as.character(corpus_clean[[1]]) 
## [1] " federal open market committee decided today raise target federal funds rate  basis points  percent although recent economic data uneven expansion economic activity appears solid core inflation stayed relatively low recent months longerterm inflation expectations remain contained nevertheless possible increases resource utilization well elevated energy prices potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives "
# example statement after cleaning
as.character(corpus_clean[[2]]) 
## [1] " federal open market committee decided today raise target federal funds rate  basis points  percent slowing growth real gdp fourth quarter  seems largely reflected temporary special factors economic growth rebounded strongly current quarter appears likely moderate sustainable pace yet run prices energy commodities appears modest effect core inflation ongoing productivity gains helped hold growth unit labor costs check inflation expectations remain contained still possible increases resource utilization combination elevated prices energy commodities potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives"

In order to ease operations on the corpus, we modify it into a data frame.

df_corpus <- data.frame(text = unlist(sapply(corpus_clean, `[`, "content")), stringsAsFactors = F)
df_corpus <- df_corpus %>% mutate(doc_id = 1:n())
df_corpus$text[1]
## [1] " federal open market committee decided today raise target federal funds rate  basis points  percent although recent economic data uneven expansion economic activity appears solid core inflation stayed relatively low recent months longerterm inflation expectations remain contained nevertheless possible increases resource utilization well elevated energy prices potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives "

In the next steps, we append statements data frame with cleaned text. We also count number of words occuring in the original statement and in the cleaned statement.

statements_clean <- statements %>% 
  mutate(cleaned_text = df_corpus$text)
# clened_text
count_cleaned_word <- statements_clean %>%
  unnest_tokens(word_count, cleaned_text) %>%
  count(ID, word_count, sort = T) %>% 
  group_by(ID) %>%
  summarize(word_cleaned_count = sum(n))

statements_clean_count <- left_join(statements_clean, count_cleaned_word, by = 'ID')
count_word <- statements_clean_count %>%
  unnest_tokens(word_count, Text) %>%
  count(ID, word_count, sort = T) %>% 
  group_by(ID) %>% 
  summarize (word_count = sum(n))

statements_final <- left_join(statements_clean_count, count_word, by = 'ID')

Word counts over time

(…)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
library(viridis)
## Loading required package: viridisLite
myplot <- statements_final %>% 
              select(Date, word_count, word_cleaned_count) %>% 
              ggplot() +
              geom_line(aes(x = Date, 
                            y = word_count), 
                        color = viridis(10)[3]) + 
              geom_line(aes(x = Date, 
                            y = word_cleaned_count), 
                        color = viridis(10)[6]) +
              labs(x = "Date", 
                   y = "Number of words", 
                   title = "Comparison of number of words between original and cleaned <br>statements content over time") +
              scale_x_date(date_breaks = "1 year", 
                           date_labels = "%Y") +
              theme_minimal()

ggplotly(myplot)

TF-IDF

(…)

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
statements_words <- statements_clean_count %>%
  mutate(year = year(Date)) %>% 
  unnest_tokens(word_count, cleaned_text) %>%
  count(year, word_count, sort = T)

statements_words
## # A tibble: 4,009 x 3
##     year word_count     n
##    <dbl> <chr>      <int>
##  1  2014 committee    139
##  2  2013 committee    127
##  3  2014 inflation    114
##  4  2015 committee    112
##  5  2017 inflation    104
##  6  2015 inflation     96
##  7  2011 committee     89
##  8  2013 will          89
##  9  2016 inflation     89
## 10  2012 committee     87
## # ... with 3,999 more rows
statements_words <- statements_words %>%
  bind_tf_idf(word_count, year, n)

statements_words
## # A tibble: 4,009 x 6
##     year word_count     n     tf   idf tf_idf
##    <dbl> <chr>      <int>  <dbl> <dbl>  <dbl>
##  1  2014 committee    139 0.0384     0      0
##  2  2013 committee    127 0.0399     0      0
##  3  2014 inflation    114 0.0315     0      0
##  4  2015 committee    112 0.0419     0      0
##  5  2017 inflation    104 0.0440     0      0
##  6  2015 inflation     96 0.0359     0      0
##  7  2011 committee     89 0.0429     0      0
##  8  2013 will          89 0.0280     0      0
##  9  2016 inflation     89 0.0353     0      0
## 10  2012 committee     87 0.0416     0      0
## # ... with 3,999 more rows
pd = statements_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word_count, levels = rev(unique(word_count)))) %>% 
  group_by(year) %>% 
  top_n(10) %>% 
  ungroup() %>%
  arrange(year, tf_idf) %>%
  mutate(order = row_number()) 
## Selecting by word
ggplot(pd, aes(order, tf_idf, fill = year)) +
  geom_bar(show.legend = FALSE, stat = "identity") +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 3, scales = "free") +
  scale_x_continuous(breaks = pd$order, 
                     labels = pd$word,
                     expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  coord_flip() +
  theme_minimal()

Wordclouds

library(wordcloud)
dtm <- TermDocumentMatrix(corpus_clean)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing=TRUE)
d <- data.frame(word = names(v), freq=v)
head(d, 10)
##                  word freq
## committee   committee  915
## inflation   inflation  834
## will             will  638
## economic     economic  556
## market         market  446
## federal       federal  445
## rate             rate  418
## labor           labor  331
## conditions conditions  326
## securities securities  314
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words=50, random.order=FALSE, rot.per=0.35, 
          colors=viridis(10))

Associations and network analysis

Sentiment analysis

# Lemmatization
statements_final$lemma_text <- lemmatize_strings(statements_final$cleaned_text)
# Tokenization
tokens <- statements_final %>%
  unnest_tokens(word, lemma_text) 

Topic modelling

ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

ap_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

library(viridis)
dupa <- ggplot(statements_topics, aes(x=Date)) + 
  geom_line(aes(y=topic1), fill =viridis(10)[2]) +
  geom_line(aes(y=topic2),  color=viridis(10)[5]) +
  geom_line(aes(y=topic3),  color=viridis(10)[10]) +
  geom_line(aes(y=topic4))
## Warning: Ignoring unknown parameters: fill
ggplotly(dupa)
ggplot(statements_topics) + 
  geom_density( aes(x = topic1, y = ..scaled..), fill = "red", alpha = 0.5) +
  geom_density( aes(x = topic2, y = ..scaled..), fill = "green", alpha = 0.5) +
  geom_density( aes(x = topic3, y = ..scaled..), fill = "blue", alpha = 0.5) +
  geom_density( aes(x = topic4, y = ..scaled..), fill = "yellow", alpha = 0.5)